library(tidyverse)
library(reshape2)
library(ggeasy)

net_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_net_MF/"
expression_dir = "/pastel/projects/speakeasy_dlpfc/"
work_dir = "/pastel/Github_scripts/SpeakEasy_dlpfc/bulk_dlpfc/"

Top 5 hub genes

Hub genes are the most connected genes in a module so, here we list the top ones by module.

#############################
# Edges table
#############################

################################## Variables to set
# edge_weight2plot = 0.01 # Filter the edges by this value. This is the fraction of the top edge correlations. 0.1 = top 10%.
edge_weight_1 = 1 # We don't wanna plot the correlations = 1 
correlation_method = "pearson"
################################## Done

# Expression data for a single set:
load(paste0(expression_dir, "exprdata_byregion.Rdata")) 
expr_matx = as.data.frame(exprData_MF) # Residuals of the expression

gene_modules = read.table(paste0(net_dir, "geneBycluster.txt"), header = T, stringsAsFactors = F) # clusters from SpeakEasy
gene_modules_size = gene_modules %>% group_by(cluster_lv3) %>% summarise(n = n()) %>% filter(n >= 30)
n_modules = nrow(gene_modules_size)
  
nodes_df = data.frame()
edges_df = data.frame()

for(i in 1:n_modules){
 # i=1
  mod2plot = gene_modules_size$cluster_lv3[i]
  modSize = gene_modules_size$n[i]
    
  # Select the expression values for the module of interest 
  to_plot = gene_modules$ensembl[gene_modules$cluster_lv3 == mod2plot]
  expr_matx_mod = expr_matx[to_plot, ]
    
  #Let's calculate the correlation matrix
  matx_cor = cor(t(expr_matx_mod), method = correlation_method) # must be gene by gene
  matx_cor_m = melt(matx_cor)
  colnames(matx_cor_m) = c("ensembl1", "ensembl2", "value")
   
  matx_cor_temp = matx_cor_m %>% left_join(gene_modules[, c("ensembl", "symbol")], by=c("ensembl1" = "ensembl"))
  colnames(matx_cor_temp) = c("ensembl1", "ensembl2", "value", "from_node")
    
  # Complete correlation matrix 
  matx_cor_temp2 = matx_cor_temp %>% left_join(gene_modules[, c("ensembl", "symbol")], by=c("ensembl2" = "ensembl"))
  colnames(matx_cor_temp2) = c("ensembl1", "ensembl2", "value", "from_node", "to_node")
    
  # Filter the edges based on correlation (correlation is used as weight here)
  edges_before_filtering = matx_cor_temp2[which(abs(matx_cor_temp2$value) != edge_weight_1), ]
    
  #order_of_edges = order(abs(edges_before_filtering$value), decreasing = T)
  #order_of_edges_filt = order_of_edges[1:round(length(order_of_edges)*edge_weight2plot)]
  #edges_filtered = edges_before_filtering[order_of_edges_filt,]
    
  # edges2keep = which(edges_before_filtering$value >= (median(abs(edges_before_filtering$value)))) # Get the edges > than the median  
  edges2keep = which(edges_before_filtering$value >= quantile(abs(edges_before_filtering$value), 3/4)) # Get the edges on the 3/4 quantiles 
  edges_filtered = edges_before_filtering[edges2keep,]
    
  colnames(edges_filtered) = c("fromNode", "toNode", "weight", "fromAltName", "toAltName") # match names for Cytoscape input
    
  # Remove NAs and duplicated ensembls 
  #edges_filtered = na.omit(edges_filtered)
  #edges_filtered = edges_filtered[! duplicated(edges_filtered), ]
  edges_filtered_df = edges_filtered
  edges_filtered_df$module = mod2plot
  edges_filtered_df$region = "MF"
    
  edges_df = rbind(edges_df, edges_filtered_df) # Filtered edges table with all modules from all cell types 
    
  #############################
  # Nodes table 
  #############################
  nodes_table = gene_modules[ , c("ensembl", "cluster_lv3", "symbol")]
  colnames(nodes_table) = c("nodeName", "nodeAttr", "altName") # Match for Cytoscape input 
  nodes_table = nodes_table[nodes_table$nodeAttr == mod2plot, ] # Only for the module of interest
    
  # Apply same filter to the nodes table
  nodes_filtered = nodes_table[nodes_table$nodeName %in% c(edges_filtered$fromNode,edges_filtered$toNode), ]
    
  # Add number of node connections in the filtered network to get the hub nodes
  nodes_filtered = inner_join(nodes_filtered, 
                              bind_rows(edges_filtered %>% group_by(fromNode) %>% dplyr::count() %>% dplyr::rename(nodeName = fromNode), 
                                        edges_filtered %>% group_by(toNode) %>% dplyr::count() %>% dplyr::rename(nodeName = toNode)) %>% 
                                group_by(nodeName) %>% distinct() %>% tally(wt = n))
    
  hub_genes_list = nodes_filtered[order(nodes_filtered$n, decreasing = T), ]$altName[1:5] # Hubs are the most connected nodes in a module (after filtering the edges)
    
  nodes_filtered_df = nodes_filtered[order(nodes_filtered$n, decreasing = T),]
  nodes_filtered_df$module_size = modSize
  nodes_filtered_df$region = "MF"
  nodes_df = rbind(nodes_df, nodes_filtered_df) # Filtered nodes table with all modules from all cell types 
}

# Top hub genes 
nodes_df_top = nodes_df %>% 
  group_by(region, module_size) %>% 
  arrange(-n) %>% 
  slice_head(n = 5)

write.table(nodes_df_top, file = paste0(work_dir, "hub_genes_MF.txt"), sep = "\t", quote = F, row.names = F)

createDT(nodes_df_top)
save(nodes_df, file = paste0(work_dir, "n_connections_MF.Rdata"))
# nodes_df %>% group_by(celltype,module_size) %>% summarise(n_genes = n()) %>% arrange(n_genes)

Plot connections

We want to check if the network has a scale-free topology.

Scale-free: many nodes with only a few links. A few hubs with large number of links.

Random: most nodes have the same number of links. No highly connected nodes resulting in a Poison distribution.

ggplot(nodes_df, aes(x=n)) +
  geom_histogram(bins = 100, color="black", fill="white") +
  easy_labs(x = "Number of links (k)", y = "Number of nodes with k links") +
  theme_classic()

Session

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggeasy_0.1.3    reshape2_1.4.4  forcats_0.5.1   stringr_1.5.0  
##  [5] dplyr_1.1.3     purrr_1.0.2     readr_2.1.4     tidyr_1.3.0    
##  [9] tibble_3.2.1    ggplot2_3.4.4   tidyverse_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.11       lubridate_1.8.0   digest_0.6.33     utf8_1.2.4       
##  [5] R6_2.5.1          cellranger_1.1.0  plyr_1.8.9        backports_1.4.1  
##  [9] reprex_2.0.1      evaluate_0.23     highr_0.10        httr_1.4.7       
## [13] pillar_1.9.0      rlang_1.1.2       readxl_1.3.1      rstudioapi_0.15.0
## [17] jquerylib_0.1.4   DT_0.30           rmarkdown_2.25    labeling_0.4.3   
## [21] htmlwidgets_1.6.2 munsell_0.5.0     broom_1.0.5       compiler_4.1.2   
## [25] modelr_0.1.8      xfun_0.41         pkgconfig_2.0.3   htmltools_0.5.7  
## [29] tidyselect_1.2.0  fansi_1.0.5       crayon_1.5.2      tzdb_0.4.0       
## [33] dbplyr_2.4.0      withr_2.5.2       grid_4.1.2        jsonlite_1.8.7   
## [37] gtable_0.3.4      lifecycle_1.0.3   DBI_1.1.3         magrittr_2.0.3   
## [41] scales_1.2.1      cli_3.6.1         stringi_1.7.12    cachem_1.0.8     
## [45] farver_2.1.1      fs_1.6.3          xml2_1.3.5        bslib_0.5.1      
## [49] ellipsis_0.3.2    generics_0.1.3    vctrs_0.6.4       tools_4.1.2      
## [53] glue_1.6.2        hms_1.1.3         crosstalk_1.2.0   fastmap_1.1.1    
## [57] yaml_2.3.7        colorspace_2.1-0  rvest_1.0.2       knitr_1.45       
## [61] haven_2.4.3       sass_0.4.7